Dependencies for this notebook:

  • pip install spacy pandas matplotlib
  • python -m spacy.en.download

In [1]:
from IPython.display import SVG, display
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [2]:
# encode some text as unicode
text = u"I'm executing this code on an Apple Computer."

#instantiate a language model
#to download language model: python -m spacy.en.download 
nlp = spacy.load('en') # or spacy.en.English()

#create a document
document = nlp(text)

In [3]:
for function in nlp.pipeline:
    print(function)


<spacy.tagger.Tagger object at 0x116f18510>
<spacy.pipeline.DependencyParser object at 0x110759f48>
<spacy.matcher.Matcher object at 0x110743978>
<spacy.pipeline.EntityRecognizer object at 0x116f9d188>

In [4]:
### Modifying the Language Model
def identify_starwars(doc):
    for token in doc:
        if token.text == u'starwars':
            token.tag_ = u'NNP'

def return_pipeline(nlp):
    return [nlp.tagger, nlp.parser, nlp.matcher, nlp.entity, identify_starwars]

text = u"I loved all of the starwars movies"
custom_nlp = spacy.load('en', create_pipeline=return_pipeline)
new_document = custom_nlp(text)

for function in custom_nlp.pipeline:
    print(function)


<spacy.tagger.Tagger object at 0x11968b5a0>
<spacy.pipeline.DependencyParser object at 0x110759e08>
<spacy.matcher.Matcher object at 0x1107437b8>
<spacy.pipeline.EntityRecognizer object at 0x116f9d368>
<function identify_starwars at 0x11075e488>
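
A quick way to verify the custom component (not part of the original run) is to
inspect the tag it assigned; a minimal sketch using the new_document created above:

In [ ]:
# check that identify_starwars re-tagged the token 'starwars'
for token in new_document:
    if token.text == u'starwars':
        print(token.text, token.tag_)  # expected: the manually assigned 'NNP'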

In [5]:
texts = [u'You have brains in your head.'] * 10000

for doc in nlp.pipe(texts,n_threads=4):
    doc.is_parsed

In [6]:
### Deploying Model on Many Texts with .pipe
runtimes = {}

for thread_count in [1,2,3,4,8]:
    t0 = datetime.now()
    
    #Create generator of processed documents
    processed_documents = nlp.pipe(texts,n_threads=thread_count)
    
    #Iterate over generator
    for doc in processed_documents: 
        
        #pipeline is only run once we access the generator
        doc.is_parsed 
    
    t1 = datetime.now()
    runtimes[thread_count] = (t1 - t0).total_seconds()
    
ax = pd.Series(runtimes).plot(kind='bar')
ax.set_xlabel("Number of threads")
ax.set_ylabel("Runtime (seconds)")
plt.show()
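
nlp.pipe also accepts a batch_size argument that controls how many documents are
buffered per batch; a minimal sketch (the value 1000 here is arbitrary):

In [ ]:
# batch_size is illustrative; the pipeline still runs lazily as the generator is consumed
for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
    doc.is_parsed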


Accessing Tokens and Spans


In [8]:
import pandas as pd
def info(obj):
    return {'type':type(obj),'__str__': str(obj)}

text = u"""spaCy excels at large-scale information extraction tasks. 
It's written from the ground up in carefully memory-managed Cython. """
document = nlp(text)
token = document[0]
span = document[0:3]

pd.DataFrame(list(map(info, [token,span,document])))


Out[8]:
__str__ type
0 spaCy <class 'spacy.tokens.token.Token'>
1 spaCy excels at <class 'spacy.tokens.span.Span'>
2 spaCy excels at large-scale information extrac... <class 'spacy.tokens.doc.Doc'>
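
Tokens and spans expose largely the same attribute API as the Doc, so the objects
above can be inspected directly; a short sketch (the attribute choices are illustrative):

In [ ]:
# the same style of attribute access works at token, span, and document level
print(token.text, token.lemma_, token.pos_)
print(span.text, span.start, span.end)   # span boundaries as token offsets
print(document.text[:40])                # the underlying text is preserved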

Sentence boundary detection


In [15]:
print(list(document.sents))
print()
for i, sent in enumerate(document.sents):
    print('%2d: "%s"' % (i, sent))


[spaCy excels at large-scale information extraction tasks. 
, It's written from the ground up in carefully memory-managed Cython.]

 0: "spaCy excels at large-scale information extraction tasks. 
"
 1: "It's written from the ground up in carefully memory-managed Cython."

Tokenization


In [17]:
for i, token in enumerate(document):
    print('%2d: "%s"' % (i, token))


 0: "spaCy"
 1: "excels"
 2: "at"
 3: "large"
 4: "-"
 5: "scale"
 6: "information"
 7: "extraction"
 8: "tasks"
 9: "."
10: "
"
11: "It"
12: "'s"
13: "written"
14: "from"
15: "the"
16: "ground"
17: "up"
18: "in"
19: "carefully"
20: "memory"
21: "-"
22: "managed"
23: "Cython"
24: "."

Morphological decomposition


In [18]:
token = document[13]
print("text: %s" % token.text)
print("suffix: %s" % token.suffix_)
print("lemma: %s" % token.lemma_)


text: written
suffix: ten
lemma: write
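
Tokens carry a number of other surface-form attributes beyond suffix and lemma;
a short sketch showing a few of them (the selection is illustrative, not exhaustive):

In [ ]:
# more surface-form attributes on the same token ('written')
print("prefix: %s" % token.prefix_)
print("shape: %s" % token.shape_)
print("is_alpha: %s, is_stop: %s" % (token.is_alpha, token.is_stop))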

Part of Speech Tagging


In [19]:
#Part of speech and Dependency tagging
attrs = list(map(lambda token: {"token": token,
                                "part of speech": token.pos_,
                                "Dependency": token.dep_},
                 document))
pd.DataFrame(attrs)


Out[19]:
Dependency part of speech token
0 nsubj NOUN spaCy
1 ROOT VERB excels
2 prep ADP at
3 amod ADJ large
4 punct PUNCT -
5 compound NOUN scale
6 compound NOUN information
7 compound NOUN extraction
8 pobj NOUN tasks
9 punct PUNCT .
10 SPACE \n
11 nsubjpass PRON It
12 auxpass VERB 's
13 ROOT VERB written
14 prep ADP from
15 det DET the
16 pobj NOUN ground
17 prt ADV up
18 prep ADP in
19 advmod ADV carefully
20 npadvmod NOUN memory
21 punct PUNCT -
22 dep VERB managed
23 npadvmod PROPN Cython
24 punct PUNCT .
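
Alongside the coarse-grained pos_ shown above, each token also carries a
fine-grained Penn Treebank tag in tag_; a quick sketch on the first few tokens:

In [ ]:
# pos_ is coarse-grained, tag_ is the fine-grained treebank tag
for token in document[:5]:
    print(token.text, token.pos_, token.tag_)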

Noun Chunking


In [20]:
print("noun chunks: %s" % list(document.noun_chunks))


noun chunks: [spaCy, large-scale information extraction tasks, It, the ground]
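
Each noun chunk is a Span, so its syntactic head can be recovered via .root;
a minimal sketch:

In [ ]:
# relate each noun chunk back to the dependency parse via its root token
for chunk in document.noun_chunks:
    print(chunk.text, '->', chunk.root.text, chunk.root.dep_)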

Named Entity Recognition


In [21]:
ents = [(ent, ent.root.ent_type_) for ent in document.ents]
print("entities: %s" % ents)


entities: [(Cython, 'ORG')]
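
Entity spans also expose their label and token offsets directly; for example:

In [ ]:
# label_ is the entity type; start/end are token offsets into the document
for ent in document.ents:
    print(ent.text, ent.label_, ent.start, ent.end)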

Text Similarity (Using Word Vectors)


In [23]:
#document, span, and token similarity
def plot_similarities(similarities, target):
    f, ax = plt.subplots(1)
    index = range(len(similarities))
    ax.barh(index, similarities)
    ax.set_yticks(list(index))
    # label each bar with the corresponding token from document2
    ax.set_yticklabels([token.text for token in document2])
    ax.grid(axis='x')
    ax.set_title("Similarity to '{}'".format(target))
    plt.show()
    return ax
    
computer = nlp(u'computer')
document2 = nlp(u'You might be using a machine running Windows')
similarities = list(map(lambda token: token.similarity(computer), document2))
ax = plot_similarities(similarities, computer)
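
Because similarity() is defined on Doc, Span, and Token alike, whole-document
comparisons work the same way; a small sketch with two made-up sentences:

In [ ]:
# similarity at the document and token level (example sentences are illustrative)
apple_doc = nlp(u'I am eating an apple')
orange_doc = nlp(u'She is eating an orange')
print(apple_doc.similarity(orange_doc))        # Doc vs Doc
print(apple_doc[4].similarity(orange_doc[4]))  # 'apple' vs 'orange'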